library(tidyverse)
library(tidymodels)
library(solitude) # -- new package
library(janitor)
library(ggpubr)
library(skimr)
library(themis)
library(dplyr)
library(vip)
library(DALEX) # new
library(DALEXtra) # new
library(rpart)
library(rpart.plot)
# Load the loan training data and standardize column names to snake_case
loan <- read_csv("loan_train.csv") %>%
clean_names()
Rows: 29777 Columns: 52── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (23): term, int_rate, grade, sub_grade, emp_title, emp_length, home_ownership, verification_status, issue_d, loan_status, ...
dbl (29): id, member_id, loan_amnt, funded_amnt, funded_amnt_inv, installment, annual_inc, dti, delinq_2yrs, fico_range_low, f...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick structural checks: preview rows and summarize every column
head(loan)
skim(loan)
── Data Summary ────────────────────────
Values
Name loan
Number of rows 29777
Number of columns 52
_______________________
Column type frequency:
character 23
numeric 29
________________________
Group variables None
kaggle <- read_csv("loan_holdout.csv") %>% clean_names()
Rows: 12761 Columns: 51── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (22): term, int_rate, grade, sub_grade, emp_title, emp_length, home_ownership, verification_status, issue_d, pymnt_plan, u...
dbl (29): id, member_id, loan_amnt, funded_amnt, funded_amnt_inv, installment, annual_inc, dti, delinq_2yrs, fico_range_low, f...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skim(kaggle)
── Data Summary ────────────────────────
Values
Name kaggle
Number of rows 12761
Number of columns 51
_______________________
Column type frequency:
character 22
numeric 29
________________________
Group variables None
n_cols <- names(loan %>% select_if(is.numeric) %>% select(-id,-member_id))
my_hist <- function(col){
  # Histogram of one numeric column of `loan` with a small summary table
  # (n, missing, distinct, mean, min, max) rendered beneath it.
  #
  # Args:
  #   col: column name as a string.
  # Side effect: prints the combined figure.
  #
  # .data[[col]] is the modern tidy-eval idiom replacing !!as.name(col).
  col_summary <- loan %>%
    summarise(n = n(),
              n_miss = sum(is.na(.data[[col]])),
              n_dist = n_distinct(.data[[col]]),
              mean = round(mean(.data[[col]], na.rm = TRUE), 2),
              min = min(.data[[col]], na.rm = TRUE),
              max = max(.data[[col]], na.rm = TRUE))
  # Summary rendered as a table grob below the histogram
  p1 <- ggtexttable(col_summary, rows = NULL,
                    theme = ttheme("mOrange"))
  h1 <- loan %>%
    ggplot(aes(x = .data[[col]])) +
    geom_histogram(bins = 30) +
    labs(x = col)  # keep the axis labeled with the column name
  plt <- ggarrange(h1, p1,
                   ncol = 1, nrow = 2,
                   heights = c(1, 0.3))
  print(plt)
}
# Draw the histogram + summary panel for every numeric column
walk(n_cols, my_hist)
Warning: Removed 3 rows containing non-finite values (stat_bin).
# Class balance of the target: count and share of each loan_status level
loan_summary <- count(loan, loan_status) %>%
  mutate(pct = n / sum(n))

# Bar chart of the target distribution with percentage labels
ggplot(loan_summary, aes(x = factor(loan_status), y = pct)) +
  geom_col() +
  geom_text(aes(label = round(pct * 100, 2)), vjust = 2.5, colour = "white") +
  labs(title = "Loan Status Plot", x = "Loan_Status", y = "PCT")
NA
NA
# Bivariate EDA against the target: factor columns -> stacked proportion
# bars by loan_status, numeric columns -> comparative boxplots
loan_vis <- loan %>%
mutate_if(is.character, factor)
for (c in names(loan_vis %>% dplyr::select(!c(id,member_id)))) {
if (c == "event_timestamp") {
# (timestamp branch kept from an earlier fraud analysis, disabled)
# print( fraud_vis %>%
#ggplot(., aes(!!as.name(c))) +
#geom_histogram(aes(bins=10,fill = loan_status), position = "fill") +labs(title = c, y = "pct fraud"))
}else if (c %in% names(loan_vis %>% dplyr::select(where(is.factor)))) {
# -- for each character column create a chart
print( loan_vis %>%
ggplot(., aes(!!as.name(c))) +
geom_bar(aes(fill = loan_status), position = "fill") + labs(title = c, y = "pct fraud"))
} else {
# -- comparative boxplots
print(ggplot(loan_vis, aes(x=loan_status, y=!!as.name(c), fill=loan_status))+ geom_boxplot() +labs(title = c))
}
}
[WARNING] Deprecated: --self-contained. use --embed-resources --standalone
[WARNING] Deprecated: --self-contained. use --embed-resources --standalone
[WARNING] Deprecated: --self-contained. use --embed-resources --standalone
#correlation
library(reshape2)
# Numeric features only; id columns are arbitrary keys with no meaning
loan_numeric <- loan %>%
  select(-id, -member_id) %>%
  select(where(is.numeric))
# use = "pairwise.complete.obs": several numeric columns contain NAs
# (see the missing counts in the skim output above); the default
# use = "everything" returns NA for any pair involving such a column.
cor_mat <- cor(loan_numeric, use = "pairwise.complete.obs")
# Long format (Var1, Var2, value) for the heatmap
cor_melt <- melt(cor_mat)
cor_melt %>%
  mutate(value = round(value, 2)) %>%
  ggplot(aes(Var2, Var1, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab",
                       name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 4, hjust = 1),
        axis.text.y = element_text(angle = 45, vjust = 1,
                                   size = 4, hjust = 1)) +
  coord_fixed() +
  geom_text(aes(Var2, Var1, label = value), color = "black", size = 1.5) +
  labs(title = "Pearson Correlation for Numerical Data")
# deal w. categoricals
# Unsupervised prep for the isolation forest: drop id and
# high-cardinality text/date columns, bucket unseen factor levels,
# impute numeric medians, then one-hot encode so every feature is numeric.
loan_recipe <- recipe(~.,loan) %>%
step_rm(id,member_id,int_rate,emp_title,url,desc,title,zip_code,earliest_cr_line,revol_util,mths_since_last_delinq,mths_since_last_record,next_pymnt_d) %>%
step_unknown(all_nominal_predictors()) %>%
step_impute_median(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
bake_loan <- bake(loan_recipe, loan)
# Isolation forest: 100 trees over samples of 1000 rows;
# max_depth = ceiling(log2(1000)) = 10
iso_forest <- isolationForest$new(
sample_size = 1000,
num_trees = 100,
max_depth = ceiling(log2(1000)))
iso_forest$fit(bake_loan)
INFO [17:34:53.812] dataset has duplicated rows
INFO [17:34:53.817] Building Isolation Forest ...
INFO [17:34:56.876] done
INFO [17:34:56.879] Computing depth of terminal nodes ...
INFO [17:34:57.094] done
INFO [17:34:58.301] Completed growing isolation forest
Evaluate the histogram and pick a value of average_depth to identify anomalies. A shorter average depth means the point is more isolated and therefore more likely to be an anomaly.
# Score every row of the baked training data with the isolation forest
pred_train <- iso_forest$predict(bake_loan)

# Histogram of average terminal-node depth; the dotted line at 9.45 is
# the candidate anomaly cutoff (shorter depth = easier to isolate)
ggplot(pred_train, aes(x = average_depth)) +
  geom_histogram(bins = 20) +
  geom_vline(xintercept = 9.45, linetype = "dotted",
             color = "blue", size = 1.5) +
  labs(title = "Isolation Forest Average Tree Depth")

# Same view on the normalized anomaly score, cutoff at 0.6
ggplot(pred_train, aes(x = anomaly_score)) +
  geom_histogram(bins = 20) +
  geom_vline(xintercept = 0.6, linetype = "dotted",
             color = "blue", size = 1.5) +
  labs(title = "Isolation Forest Anomaly Score Above 0.6")
NA
NA
The steps of interpreting anomalies on a global level are:
# Flag rows isolated at depth <= 9.45 as anomalies, then tally the classes
train_pred <- iso_forest$predict(bake_loan) %>%
  bind_cols(bake_loan) %>%
  mutate(
    anomaly = factor(if_else(average_depth <= 9.45, "Anomaly", "Normal"))
  )

train_pred %>%
  arrange(average_depth) %>%
  count(anomaly)
NA
# Global surrogate: fit a shallow decision tree to predict the
# isolation-forest anomaly flag from every baked feature, so the
# forest's behavior can be read as a handful of rules.
fmla <- as.formula(paste("anomaly ~ ", paste(bake_loan %>% colnames(), collapse= "+")))
outlier_tree <- decision_tree(min_n=2, tree_depth=3, cost_complexity = .01) %>%
set_mode("classification") %>%
set_engine("rpart") %>%
fit(fmla, data=train_pred)
# Print the underlying rpart tree
outlier_tree$fit
n= 29777
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 29777 6 Normal (0.0002014978 0.9997985022)
2) last_pymnt_amnt>=34166.69 14 1 Normal (0.0714285714 0.9285714286)
4) open_acc>=24 1 0 Anomaly (1.0000000000 0.0000000000) *
5) open_acc< 24 13 0 Normal (0.0000000000 1.0000000000) *
3) last_pymnt_amnt< 34166.69 29763 5 Normal (0.0001679938 0.9998320062)
6) revol_bal>=214867 45 1 Normal (0.0222222222 0.9777777778)
12) revol_bal< 215273.5 1 0 Anomaly (1.0000000000 0.0000000000) *
13) revol_bal>=215273.5 44 0 Normal (0.0000000000 1.0000000000) *
7) revol_bal< 214867 29718 4 Normal (0.0001345986 0.9998654014) *
library(rpart.plot) # -- plotting decision trees
# Visualize the global surrogate tree
rpart.plot(outlier_tree$fit,clip.right.labs = FALSE, branch = .3, under = TRUE, roundint=FALSE, extra=3)
# Extract the tree's decision rules as a data frame (one row per leaf)
anomaly_rules <- rpart.rules(outlier_tree$fit,roundint=FALSE, extra = 4, cover = TRUE, clip.facs = TRUE) %>% clean_names() %>%
#filter(anomaly=="Anomaly") %>%
mutate(rule = "IF")
# Concatenate the split-condition columns (x_*) into one readable string
rule_cols <- anomaly_rules %>% select(starts_with("x_")) %>% colnames()
for (col in rule_cols){
anomaly_rules <- anomaly_rules %>%
mutate(rule = paste(rule, !!as.name(col)))
}
# Assembled IF/THEN rules for the anomalous leaves (with coverage)
anomaly_rules %>%
as.data.frame() %>%
filter(anomaly == "Anomaly") %>%
mutate(rule = paste(rule, " THEN ", anomaly )) %>%
mutate(rule = paste(rule," coverage ", cover)) %>%
select( rule)
# Same formatting for the normal leaves
anomaly_rules %>%
as.data.frame() %>%
filter(anomaly == "Normal") %>%
mutate(rule = paste(rule, " THEN ", anomaly )) %>%
mutate(rule = paste(rule," coverage ", cover)) %>%
select( rule)
# Re-score the data and list the anomalous rows, most anomalous first
pred_train <- bind_cols(iso_forest$predict(bake_loan),
bake_loan)
pred_train %>%
arrange(desc(anomaly_score) ) %>%
filter(average_depth <= 9.45)
# Local explanation target: one-vs-rest label for the single row with
# id 28576 (presumably one of the anomaly ids listed above -- confirm)
fmla <- as.formula(paste("anomaly ~ ", paste(bake_loan %>% colnames(), collapse= "+")))
pred_train %>%
mutate(anomaly= as.factor(if_else(id==28576, "Anomaly", "Normal"))) -> local_df
# Fit a deeper one-vs-rest surrogate tree that isolates the flagged row
local_tree <- decision_tree(tree_depth = 5,
                            min_n = 1,
                            cost_complexity = 0) %>%
  set_mode("classification") %>%
  set_engine("rpart") %>%
  fit(fmla, data = local_df)
# Print the underlying rpart object
local_tree$fit
n= 29777
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
2) addr_state_SD>=0.5 48 1 Normal (2.083333e-02 9.791667e-01)
4) annual_inc>=102052 1 0 Anomaly (1.000000e+00 0.000000e+00) *
5) annual_inc< 102052 47 0 Normal (0.000000e+00 1.000000e+00) *
3) addr_state_SD< 0.5 29729 0 Normal (0.000000e+00 1.000000e+00) *
# Rules and plot for the single-anomaly surrogate tree.
# roundint = FALSE on every rpart.rules call: the parsnip fit does not
# retain the training data, so rpart.rules cannot determine integer
# splits and warns otherwise (the third call below originally lacked it
# and produced exactly that warning).
rpart.rules(local_tree$fit, extra = 4, cover = TRUE, clip.facs = TRUE, roundint=FALSE)
rpart.plot(local_tree$fit, roundint=FALSE, extra=3)
anomaly_rules <- rpart.rules(local_tree$fit, extra = 4, cover = TRUE, clip.facs = TRUE, roundint = FALSE) %>% clean_names() %>%
  filter(anomaly=="Anomaly") %>%
  mutate(rule = "IF")
Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
To silence this warning:
Call rpart.rules with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
# Concatenate the split-condition columns (x_*) into one readable string
rule_cols <- anomaly_rules %>% select(starts_with("x_")) %>% colnames()
for (col in rule_cols){
anomaly_rules <- anomaly_rules %>%
mutate(rule = paste(rule, !!as.name(col)))
}
as.data.frame(anomaly_rules) %>%
select(rule, cover)
# Verify the rule directly: count the rows that satisfy it and check
# their mean income
local_df %>%
filter(addr_state_SD >=0.5) %>%
filter(annual_inc >=102052) %>%
summarise(n=n(),
mean_annual_inc = mean(annual_inc))
local_explainer <- function(ID){
  # Explain a single anomalous row: fit a shallow one-vs-rest surrogate
  # tree that separates the row with the given id from everything else,
  # plot the tree, and print the IF/THEN rule(s) that isolate it.
  #
  # Args:
  #   ID: value of the `id` column for the row to explain.
  # Side effects: draws the tree plot and prints the rule table.
  fmla <- as.formula(paste("anomaly ~ ", paste(bake_loan %>% colnames(), collapse= "+")))

  # One-vs-rest target: only the requested row is labeled "Anomaly"
  local_df <- pred_train %>%
    mutate(anomaly = as.factor(if_else(id == ID, "Anomaly", "Normal")))

  local_tree <- decision_tree(mode = "classification",
                              tree_depth = 3,
                              min_n = 1,
                              cost_complexity = 0) %>%
    set_engine("rpart") %>%
    fit(fmla, local_df)

  # rpart.plot() draws as a side effect; piping its (invisible) return
  # value into print() dumped the internal plot list to the console.
  rpart.plot(local_tree$fit, roundint = FALSE, extra = 3)

  # roundint = FALSE: the parsnip fit does not keep the training data,
  # so rpart.rules cannot infer integer-valued splits and warns without it.
  anomaly_rules <- rpart.rules(local_tree$fit, extra = 4, cover = TRUE,
                               clip.facs = TRUE, roundint = FALSE) %>%
    clean_names() %>%
    filter(anomaly == "Anomaly") %>%
    mutate(rule = "IF")

  # Concatenate the split-condition columns (x_*) into one readable string
  rule_cols <- anomaly_rules %>% select(starts_with("x_")) %>% colnames()
  for (col in rule_cols){
    anomaly_rules <- anomaly_rules %>%
      mutate(rule = paste(rule, !!as.name(col)))
  }

  as.data.frame(anomaly_rules) %>%
    select(rule, cover) %>%
    print()
}
# IDs of every anomalous point (average depth at or below the cutoff)
anomaly_vect <- pred_train %>%
  filter(average_depth <= 9.45) %>%
  pull(id)

# Produce a local surrogate-tree explanation for each anomaly
walk(anomaly_vect, local_explainer)
$obj
n= 29777
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
2) last_pymnt_amnt>=34166.69 14 1 Normal (7.142857e-02 9.285714e-01)
4) open_acc>=24 1 0 Anomaly (1.000000e+00 0.000000e+00) *
5) open_acc< 24 13 0 Normal (0.000000e+00 1.000000e+00) *
3) last_pymnt_amnt< 34166.69 29763 0 Normal (0.000000e+00 1.000000e+00) *
$snipped.nodes
NULL
$xlim
[1] 0 1
$ylim
[1] 0 1
$x
[1] 0.60786912 0.27647172 0.05554013 0.49740332 0.93926651
$y
[1] 0.92241378 0.52076802 0.03879311 0.03879311 0.03879311
$branch.x
[,1] [,2] [,3] [,4] [,5]
x 0.6078691 0.2764717 0.05554013 0.4974033 0.9392665
NA 0.2764717 0.05554013 0.4974033 0.9392665
NA 0.6078691 0.27647172 0.2764717 0.6078691
$branch.y
[,1] [,2] [,3] [,4] [,5]
y 1.001438 0.5997924 0.1178175 0.1178175 0.1178175
NA 0.8310967 0.4294509 0.4294509 0.8310967
NA 0.8310967 0.4294509 0.4294509 0.8310967
$labs
[1] "Normal\n1 / 29777" "Normal\n1 / 14" "Anomaly\n0 / 1" "Normal\n0 / 13" "Normal\n0 / 29763"
$cex
[1] 1
$boxes
$boxes$x1
[1] 0.541919895 0.224887678 -0.004532432 0.445819274 0.873317289
$boxes$y1
[1] 0.878511331 0.476865573 -0.005109338 -0.005109338 -0.005109338
$boxes$x2
[1] 0.6738183 0.3280558 0.1156127 0.5489874 1.0052157
$boxes$y2
[1] 1.0014382 0.5997924 0.1178175 0.1178175 0.1178175
$split.labs
[1] ""
$split.cex
[1] 1 1 1 1 1
$split.box
$split.box$x1
[1] 0.4309163 0.1680799 NA NA NA
$split.box$y1
[1] 0.7959747 0.3943290 NA NA NA
$split.box$x2
[1] 0.7848220 0.3848635 NA NA NA
$split.box$y2
[1] 0.8662186 0.4645729 NA NA NA
$obj
n= 29777
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
2) sub_grade_G1>=0.5 102 1 Normal (9.803922e-03 9.901961e-01)
4) purpose_medical>=0.5 1 0 Anomaly (1.000000e+00 0.000000e+00) *
5) purpose_medical< 0.5 101 0 Normal (0.000000e+00 1.000000e+00) *
3) sub_grade_G1< 0.5 29675 0 Normal (0.000000e+00 1.000000e+00) *
$snipped.nodes
NULL
$xlim
[1] 0 1
$ylim
[1] 0 1
$x
[1] 0.60786912 0.27647172 0.05554013 0.49740332 0.93926651
$y
[1] 0.92241378 0.52076802 0.03879311 0.03879311 0.03879311
$branch.x
[,1] [,2] [,3] [,4] [,5]
x 0.6078691 0.2764717 0.05554013 0.4974033 0.9392665
NA 0.2764717 0.05554013 0.4974033 0.9392665
NA 0.6078691 0.27647172 0.2764717 0.6078691
$branch.y
[,1] [,2] [,3] [,4] [,5]
y 1.001438 0.5997924 0.1178175 0.1178175 0.1178175
NA 0.8310967 0.4294509 0.4294509 0.8310967
NA 0.8310967 0.4294509 0.4294509 0.8310967
$labs
[1] "Normal\n1 / 29777" "Normal\n1 / 102" "Anomaly\n0 / 1" "Normal\n0 / 101" "Normal\n0 / 29675"
$cex
[1] 1
$boxes
$boxes$x1
[1] 0.541919895 0.224887678 -0.004532432 0.445819274 0.873317289
$boxes$y1
[1] 0.878511331 0.476865573 -0.005109338 -0.005109338 -0.005109338
$boxes$x2
[1] 0.6738183 0.3280558 0.1156127 0.5489874 1.0052157
$boxes$y2
[1] 1.0014382 0.5997924 0.1178175 0.1178175 0.1178175
$split.labs
[1] ""
$split.cex
[1] 1 1 1 1 1
$split.box
$split.box$x1
[1] 0.4648703 0.1184548 NA NA NA
$split.box$y1
[1] 0.7959747 0.3943290 NA NA NA
$split.box$x2
[1] 0.7508679 0.4344887 NA NA NA
$split.box$y2
[1] 0.8662186 0.4645729 NA NA NA
$obj
n= 29777
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
2) last_credit_pull_d_Jun.13>=0.5 224 1 Normal (4.464286e-03 9.955357e-01)
4) sub_grade_C2>=0.5 8 1 Normal (1.250000e-01 8.750000e-01)
8) fico_range_low>=705 1 0 Anomaly (1.000000e+00 0.000000e+00) *
9) fico_range_low< 705 7 0 Normal (0.000000e+00 1.000000e+00) *
5) sub_grade_C2< 0.5 216 0 Normal (0.000000e+00 1.000000e+00) *
3) last_credit_pull_d_Jun.13< 0.5 29553 0 Normal (0.000000e+00 1.000000e+00) *
$snipped.nodes
NULL
$xlim
[1] 0 1
$ylim
[1] 0 1
$x
[1] 0.68151298 0.42375945 0.20282786 0.05554013 0.35011559 0.64469105 0.93926651
$y
[1] 0.92241378 0.65464994 0.38688610 0.03879311 0.03879311 0.03879311 0.03879311
$branch.x
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
x 0.681513 0.4237595 0.2028279 0.05554013 0.3501156 0.6446910 0.9392665
NA 0.4237595 0.2028279 0.05554013 0.3501156 0.6446910 0.9392665
NA 0.6815130 0.4237595 0.20282786 0.2028279 0.4237595 0.6815130
$branch.y
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
y 1.001438 0.7336743 0.4659105 0.1178175 0.1178175 0.1178175 0.1178175
NA 0.8310967 0.5633328 0.2955690 0.2955690 0.5633328 0.8310967
NA 0.8310967 0.5633328 0.2955690 0.2955690 0.5633328 0.8310967
$labs
[1] "Normal\n1 / 29777" "Normal\n1 / 224" "Normal\n1 / 8" "Anomaly\n0 / 1" "Normal\n0 / 7" "Normal\n0 / 216"
[7] "Normal\n0 / 29553"
$cex
[1] 1
$boxes
$boxes$x1
[1] 0.615563760 0.372175408 0.151243812 -0.004532432 0.298531543 0.593107005 0.873317289
$boxes$y1
[1] 0.878511331 0.610747492 0.342983653 -0.005109338 -0.005109338 -0.005109338 -0.005109338
$boxes$x2
[1] 0.7474622 0.4753435 0.2544119 0.1156127 0.4016996 0.6962751 1.0052157
$boxes$y2
[1] 1.0014382 0.7336743 0.4659105 0.1178175 0.1178175 0.1178175 0.1178175
$split.labs
[1] ""
$split.cex
[1] 1 1 1 1 1 1 1
$split.box
$split.box$x1
[1] 0.47125903 0.28206657 0.05329942 NA NA NA NA
$split.box$y1
[1] 0.7959747 0.5282109 0.2604471 NA NA NA NA
$split.box$x2
[1] 0.8917669 0.5654523 0.3523563 NA NA NA NA
$split.box$y2
[1] 0.8662186 0.5984548 0.3306910 NA NA NA NA
$obj
n= 29777
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
2) last_pymnt_d_Dec.10>=0.5 233 1 Normal (4.291845e-03 9.957082e-01)
4) funded_amnt_inv>=23591 2 1 Anomaly (5.000000e-01 5.000000e-01)
8) loan_amnt< 24125 1 0 Anomaly (1.000000e+00 0.000000e+00) *
9) loan_amnt>=24125 1 0 Normal (0.000000e+00 1.000000e+00) *
5) funded_amnt_inv< 23591 231 0 Normal (0.000000e+00 1.000000e+00) *
3) last_pymnt_d_Dec.10< 0.5 29544 0 Normal (0.000000e+00 1.000000e+00) *
$snipped.nodes
NULL
$xlim
[1] 0 1
$ylim
[1] 0 1
$x
[1] 0.68151298 0.42375945 0.20282786 0.05554013 0.35011559 0.64469105 0.93926651
$y
[1] 0.92241378 0.65464994 0.38688610 0.03879311 0.03879311 0.03879311 0.03879311
$branch.x
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
x 0.681513 0.4237595 0.2028279 0.05554013 0.3501156 0.6446910 0.9392665
NA 0.4237595 0.2028279 0.05554013 0.3501156 0.6446910 0.9392665
NA 0.6815130 0.4237595 0.20282786 0.2028279 0.4237595 0.6815130
$branch.y
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
y 1.001438 0.7336743 0.4659105 0.1178175 0.1178175 0.1178175 0.1178175
NA 0.8310967 0.5633328 0.2955690 0.2955690 0.5633328 0.8310967
NA 0.8310967 0.5633328 0.2955690 0.2955690 0.5633328 0.8310967
$labs
[1] "Normal\n1 / 29777" "Normal\n1 / 233" "Anomaly\n1 / 2" "Anomaly\n0 / 1" "Normal\n0 / 1" "Normal\n0 / 231"
[7] "Normal\n0 / 29544"
$cex
[1] 1
$boxes
$boxes$x1
[1] 0.615563760 0.372175408 0.142755298 -0.004532432 0.298531543 0.593107005 0.873317289
$boxes$y1
[1] 0.878511331 0.610747492 0.342983653 -0.005109338 -0.005109338 -0.005109338 -0.005109338
$boxes$x2
[1] 0.7474622 0.4753435 0.2629004 0.1156127 0.4016996 0.6962751 1.0052157
$boxes$y2
[1] 1.0014382 0.7336743 0.4659105 0.1178175 0.1178175 0.1178175 0.1178175
$split.labs
[1] ""
$split.cex
[1] 1 1 1 1 1 1 1
$split.box
$split.box$x1
[1] 0.49672457 0.24615363 0.07550015 NA NA NA NA
$split.box$y1
[1] 0.7959747 0.5282109 0.2604471 NA NA NA NA
$split.box$x2
[1] 0.8663014 0.6013653 0.3301556 NA NA NA NA
$split.box$y2
[1] 0.8662186 0.5984548 0.3306910 NA NA NA NA
$obj
n= 29777
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
2) revol_bal>=214867 45 1 Normal (2.222222e-02 9.777778e-01)
4) revol_bal< 215273.5 1 0 Anomaly (1.000000e+00 0.000000e+00) *
5) revol_bal>=215273.5 44 0 Normal (0.000000e+00 1.000000e+00) *
3) revol_bal< 214867 29732 0 Normal (0.000000e+00 1.000000e+00) *
$snipped.nodes
NULL
$xlim
[1] 0 1
$ylim
[1] 0 1
$x
[1] 0.60786912 0.27647172 0.05554013 0.49740332 0.93926651
$y
[1] 0.92241378 0.52076802 0.03879311 0.03879311 0.03879311
$branch.x
[,1] [,2] [,3] [,4] [,5]
x 0.6078691 0.2764717 0.05554013 0.4974033 0.9392665
NA 0.2764717 0.05554013 0.4974033 0.9392665
NA 0.6078691 0.27647172 0.2764717 0.6078691
$branch.y
[,1] [,2] [,3] [,4] [,5]
y 1.001438 0.5997924 0.1178175 0.1178175 0.1178175
NA 0.8310967 0.4294509 0.4294509 0.8310967
NA 0.8310967 0.4294509 0.4294509 0.8310967
$labs
[1] "Normal\n1 / 29777" "Normal\n1 / 45" "Anomaly\n0 / 1" "Normal\n0 / 44" "Normal\n0 / 29732"
$cex
[1] 1
$boxes
$boxes$x1
[1] 0.541919895 0.224887678 -0.004532432 0.445819274 0.873317289
$boxes$y1
[1] 0.878511331 0.476865573 -0.005109338 -0.005109338 -0.005109338
$boxes$x2
[1] 0.6738183 0.3280558 0.1156127 0.5489874 1.0052157
$boxes$y2
[1] 1.0014382 0.5997924 0.1178175 0.1178175 0.1178175
$split.labs
[1] ""
$split.cex
[1] 1 1 1 1 1
$split.box
$split.box$x1
[1] 0.4733588 0.1497970 NA NA NA
$split.box$y1
[1] 0.7959747 0.3943290 NA NA NA
$split.box$x2
[1] 0.7423794 0.4031465 NA NA NA
$split.box$y2
[1] 0.8662186 0.4645729 NA NA NA
$obj
n= 29777
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
2) addr_state_SD>=0.5 48 1 Normal (2.083333e-02 9.791667e-01)
4) annual_inc>=102052 1 0 Anomaly (1.000000e+00 0.000000e+00) *
5) annual_inc< 102052 47 0 Normal (0.000000e+00 1.000000e+00) *
3) addr_state_SD< 0.5 29729 0 Normal (0.000000e+00 1.000000e+00) *
$snipped.nodes
NULL
$xlim
[1] 0 1
$ylim
[1] 0 1
$x
[1] 0.60786912 0.27647172 0.05554013 0.49740332 0.93926651
$y
[1] 0.92241378 0.52076802 0.03879311 0.03879311 0.03879311
$branch.x
[,1] [,2] [,3] [,4] [,5]
x 0.6078691 0.2764717 0.05554013 0.4974033 0.9392665
NA 0.2764717 0.05554013 0.4974033 0.9392665
NA 0.6078691 0.27647172 0.2764717 0.6078691
$branch.y
[,1] [,2] [,3] [,4] [,5]
y 1.001438 0.5997924 0.1178175 0.1178175 0.1178175
NA 0.8310967 0.4294509 0.4294509 0.8310967
NA 0.8310967 0.4294509 0.4294509 0.8310967
$labs
[1] "Normal\n1 / 29777" "Normal\n1 / 48" "Anomaly\n0 / 1" "Normal\n0 / 47" "Normal\n0 / 29729"
$cex
[1] 1
$boxes
$boxes$x1
[1] 0.541919895 0.224887678 -0.004532432 0.445819274 0.873317289
$boxes$y1
[1] 0.878511331 0.476865573 -0.005109338 -0.005109338 -0.005109338
$boxes$x2
[1] 0.6738183 0.3280558 0.1156127 0.5489874 1.0052157
$boxes$y2
[1] 1.0014382 0.5997924 0.1178175 0.1178175 0.1178175
$split.labs
[1] ""
$split.cex
[1] 1 1 1 1 1
$split.box
$split.box$x1
[1] 0.4642173 0.1315140 NA NA NA
$split.box$y1
[1] 0.7959747 0.3943290 NA NA NA
$split.box$x2
[1] 0.7515209 0.4214294 NA NA NA
$split.box$y2
[1] 0.8662186 0.4645729 NA NA NA
#Models
# Recode the target: "default" -> "1", "current" -> "0", then factor it.
# NOTE(review): case_when() has no TRUE ~ fallback, so any other status
# value becomes NA -- confirm loan_status takes only these two values.
loan <- loan %>%
mutate_if(is.character,as.factor) %>%
mutate(loan_status=case_when(loan_status=="default"~"1",
loan_status=="current"~"0")) %>%
mutate(loan_status = factor(loan_status))
# Preview the recoded target
head(loan)
# Reproducible, stratified 70/30 train/test split
set.seed(08)
train_test_spit<- initial_split(loan, prop = 0.7, strata=loan_status)
train <- training(train_test_spit)
test <- testing(train_test_spit)
sprintf("Train PCT : %1.2f%%", nrow(train)/ nrow(loan) * 100)
[1] "Train PCT : 70.00%"
sprintf("Test PCT : %1.2f%%", nrow(test)/ nrow(loan) * 100)
[1] "Test PCT : 30.00%"
# Kfold cross validation
# 5-fold CV resamples (defined here; not used further in this chunk)
kfold_splits <- vfold_cv(train, v=5)
# -- define recipe
# Shared preprocessing for the tree models: bucket unseen factor levels,
# drop near-zero-variance nominals, impute numeric medians, one-hot encode.
# NOTE(review): loan_status appears on BOTH sides of the formula below
# (outcome and predictor list); the run tolerated it, but it should be
# removed from the right-hand side.
model_loan_recipe <- recipe(loan_status ~ loan_amnt + funded_amnt+funded_amnt_inv+term+installment+grade+sub_grade+emp_length+home_ownership+annual_inc+verification_status+issue_d+loan_status+pymnt_plan+purpose+addr_state+dti+delinq_2yrs+fico_range_low+fico_range_high+inq_last_6mths+open_acc+pub_rec+revol_bal+total_acc+out_prncp+out_prncp_inv+total_rec_late_fee+last_pymnt_d+last_pymnt_amnt+last_credit_pull_d+collections_12_mths_ex_med+policy_code+application_type+acc_now_delinq+chargeoff_within_12_mths+delinq_amnt+pub_rec_bankruptcies+tax_liens,
data = train) %>%
step_unknown(all_nominal_predictors()) %>%
step_nzv(all_nominal_predictors()) %>%
step_impute_median(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
## -- define recipe for an MLP
# Same predictor set as model_loan_recipe, plus class downsampling and
# scaling for the neural net. Fixes vs the original:
#  * impute BEFORE normalizing (the original normalized first, then
#    imputed medians on already-normalized data)
#  * step_normalize() applied once (it appeared twice)
loan_recipe_nn <- recipe(loan_status ~ loan_amnt + funded_amnt+funded_amnt_inv+term+installment+grade+sub_grade+emp_length+home_ownership+annual_inc+verification_status+issue_d+loan_status+pymnt_plan+purpose+addr_state+dti+delinq_2yrs+fico_range_low+fico_range_high+inq_last_6mths+open_acc+pub_rec+revol_bal+total_acc+out_prncp+out_prncp_inv+total_rec_late_fee+last_pymnt_d+last_pymnt_amnt+last_credit_pull_d+collections_12_mths_ex_med+policy_code+application_type+acc_now_delinq+chargeoff_within_12_mths+delinq_amnt+pub_rec_bankruptcies+tax_liens,
data = train) %>%
  step_unknown(all_nominal_predictors()) %>%
  themis::step_downsample(loan_status, under_ratio = 1) %>%
  step_nzv(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())
# Sanity check: bake a small sample through the prepped recipe
bake(loan_recipe_nn %>% prep(), train %>% sample_n(1000))
# -- XGB model & workflow
# Gradient-boosted trees (xgboost), 20 rounds, classification mode
xgb_model <- boost_tree(trees = 20) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

# Bundle preprocessing + model and fit on the training split
xgb_workflow_fit <- workflow() %>%
  add_model(xgb_model) %>%
  add_recipe(model_loan_recipe) %>%
  fit(data = train)
[WARNING] Deprecated: --self-contained. use --embed-resources --standalone
# -- RF model & workflow
# Random forest (ranger), 20 trees, 8 threads, permutation importance
rf_model <- rand_forest(trees = 20) %>%
  set_mode("classification") %>%
  set_engine("ranger", num.threads = 8, importance = "permutation")

rf_workflow_fit <- workflow() %>%
  add_model(rf_model) %>%
  add_recipe(model_loan_recipe) %>%
  fit(data = train)
# -- NNet model & workflow
# Single-hidden-layer perceptron: 10 units, light dropout, 20 epochs;
# MaxNWts raises nnet's weight-count limit for the wide dummy matrix
nn_model <- mlp(hidden_units = 10, dropout = 0.01, epochs = 20) %>%
  set_mode("classification") %>%
  set_engine("nnet", MaxNWts = 10240)

nn_workflow_fit <- workflow() %>%
  add_model(nn_model) %>%
  add_recipe(loan_recipe_nn) %>%
  fit(data = train)
evaluate_models <- function(model_workflow, model_name){
  # Score and evaluate a fitted workflow on the train and test splits:
  # prints a metrics table, an ROC curve, the test score distribution,
  # and the top-10 variable importances.
  #
  # Args:
  #   model_workflow: a fitted workflows::workflow object.
  #   model_name:     label used in the printed table and chart titles.

  # 1. Make Predictions (class probabilities + hard labels) on both splits
  score_train <- bind_cols(
    predict(model_workflow, train, type = "prob"),
    predict(model_workflow, train, type = "class"),
    train) %>%
    mutate(part = "train")
  score_test <- bind_cols(
    predict(model_workflow, test, type = "prob"),
    predict(model_workflow, test, type = "class"),
    test) %>%
    mutate(part = "test")

  # Treat the second factor level ("1") as the event of interest.
  # Save and restore the global option so this function leaves no
  # lasting side effect on the session.
  old_opts <- options(yardstick.event_first = FALSE)
  on.exit(options(old_opts), add = TRUE)

  # 2. Metrics table by split
  bind_rows(score_train, score_test) %>%
    group_by(part) %>%
    metrics(loan_status, .pred_1, estimate = .pred_class) %>%
    pivot_wider(id_cols = part, names_from = .metric, values_from = .estimate) %>%
    mutate(model_name = model_name) %>% print()

  # 3. ROC Curve with a reference line at FPR = 0.20
  bind_rows(score_train, score_test) %>%
    group_by(part) %>%
    roc_curve(truth = loan_status, predicted = .pred_1) %>%
    autoplot() +
    geom_vline(xintercept = 0.20,
               color = "black",
               linetype = "longdash") +
    labs(title = model_name, x = "FPR(1 - specificity)", y = "TPR(recall)") -> roc_chart
  print(roc_chart)

  # 4. Score Distribution on the test split with candidate thresholds.
  # Colors belong OUTSIDE aes(): inside aes() the strings "red"/"green"/
  # "blue" are treated as factor levels and mapped to palette colors.
  score_test %>%
    ggplot(aes(.pred_1, fill = loan_status)) +
    geom_histogram(bins = 50) +
    geom_vline(xintercept = 0.5, color = "red") +
    geom_vline(xintercept = 0.3, color = "green") +
    geom_vline(xintercept = 0.7, color = "blue") +
    labs(title = model_name) -> score_dist
  print(score_dist)

  # 5. Variable Importance. labs() takes name-value pairs; the original
  # labs(model_name) passed an unnamed (and therefore ignored) label.
  model_workflow %>%
    extract_fit_parsnip() %>%
    vip(10) +
    labs(title = model_name) -> vip_model
  print(vip_model)
}
# Run the evaluation suite for each fitted model
evaluate_models(xgb_workflow_fit, "XGB model")
evaluate_models(rf_workflow_fit, "RF model")
evaluate_models(nn_workflow_fit, "NNet model")
# Threshold-vs-sensitivity operating chart.
# NOTE(review): scored_test is not defined anywhere in this file --
# evaluate_models() builds score_test only inside its own scope.
# Presumably a scored test set was created interactively; confirm.
scored_test %>%
roc_curve(loan_status,.pred_1) %>%
mutate(fpr = 1 - specificity) %>%
ggplot(aes(x=.threshold,y=sensitivity)) +
geom_line() +
labs(title="Threshold vs TPR", x=".pred_1",y="TPR")
# Training confusion matrix heatmap.
# Fix: `%>%` binds tighter than `+`, so the original
# `labs(...) %>% print()` printed the bare labels list (the `$title`
# dump in the log) instead of the plot. At top level the plot
# auto-prints, so no explicit print() is needed.
scored_train %>%
  conf_mat(loan_status, .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title = " training confusion matrix")
$title
[1] " training confusion matrix"
attr(,"class")
[1] "labels"
# Test confusion matrix heatmap.
# Fixes: (1) the title previously said "training" -- copy/paste from the
# training chart -- although this scores the test set; (2) `%>%` binds
# tighter than `+`, so `labs(...) %>% print()` printed the bare labels
# list rather than the plot; top-level auto-printing makes print()
# unnecessary.
scored_test %>%
  conf_mat(loan_status, .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title = " test confusion matrix")
$title
[1] " training confusion matrix"
attr(,"class")
[1] "labels"
# Top-10 variable importance for the XGB model.
# extract_fit_parsnip() replaces pull_workflow_fit(), deprecated since
# workflows 0.2.3 (see the warning in the log). labs() arguments must be
# named; the original unnamed labs("XGB VIP") was silently ignored.
xgb_workflow_fit %>%
  extract_fit_parsnip() %>%
  vip(10) +
  labs(title = "XGB VIP")
Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
Please use `extract_fit_parsnip()` instead.
# Top-10 variable importance for the RF model.
# extract_fit_parsnip() replaces the deprecated pull_workflow_fit();
# labs() needs a named argument to actually set the title.
rf_workflow_fit %>%
  extract_fit_parsnip() %>%
  vip(10) +
  labs(title = "RF VIP")
Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
Please use `extract_fit_parsnip()` instead.
# Top-10 variable importance for the NNet model.
# extract_fit_parsnip() replaces the deprecated pull_workflow_fit();
# labs() needs a named argument to actually set the title.
nn_workflow_fit %>%
  extract_fit_parsnip() %>%
  vip(10) +
  labs(title = "NN VIP")
Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
Please use `extract_fit_parsnip()` instead.
# Score the Kaggle holdout and write the submission file (id + P(default)).
# NOTE(review): the date-like factor columns (issue_d, last_pymnt_d,
# last_credit_pull_d) contain levels unseen in training; at predict time
# those values are coerced to NA (see the novel-levels warnings below).
predict(xgb_workflow_fit, kaggle, type = "prob") %>%
bind_cols(kaggle) %>%
dplyr::select(id,loan_status = .pred_1) %>%
write_csv("my_kaggle.csv")
Warning: Novel levels found in column 'issue_d': 'Dec-2011', 'Nov-2011', 'Oct-2011', 'Sep-2011', 'Aug-2011', 'Jul-2011', 'Jun-2011', 'May-2011', 'Apr-2011', 'Mar-2011', 'Feb-2011', 'Jan-2011', 'Dec-2010', 'Nov-2010', 'Oct-2010', 'Sep-2010', 'Aug-2010', 'Jul-2010', 'Jun-2010', 'May-2010', 'Apr-2010', 'Mar-2010', 'Feb-2010', 'Jan-2010', 'Dec-2009', 'Nov-2009', 'Oct-2009', 'Sep-2009', 'Aug-2009', 'Jul-2009', 'Jun-2009', 'May-2009', 'Apr-2009', 'Mar-2009', 'Feb-2009', 'Jan-2009', 'Dec-2008', 'Nov-2008', 'Oct-2008', 'Sep-2008', 'Aug-2008', 'Jul-2008', 'Jun-2008', 'May-2008', 'Apr-2008', 'Mar-2008', 'Feb-2008', 'Jan-2008', 'Dec-2007', 'Nov-2007', 'Oct-2007', 'Sep-2007', 'Aug-2007', 'Jul-2007', 'Jun-2007'. The levels have been removed, and values have been coerced to 'NA'.Warning: Novel levels found in column 'last_pymnt_d': 'Jun-2014', 'Sep-2016', 'Jan-2015', 'Nov-2012', 'Jul-2012', 'Feb-2015', 'Oct-2013', 'Oct-2012', 'Sep-2012', 'Apr-2013', 'Oct-2014', 'Aug-2012', 'Jul-2013', 'Jan-2016', 'Feb-2016', 'Aug-2013', 'Apr-2015', 'Apr-2014', 'Jun-2012', 'Feb-2013', 'Jun-2013', 'Nov-2013', 'Sep-2014', 'Dec-2014', 'Mar-2013', 'Dec-2013', 'Mar-2014', 'May-2014', 'Nov-2015', 'Jan-2014', 'May-2016', 'Jun-2016', 'Dec-2012', 'Feb-2012', 'Feb-2014', 'Aug-2015', 'Jul-2016', 'Jul-2014', 'Jan-2013', 'May-2013', 'Apr-2012', 'Mar-2016', 'Sep-2013', 'Sep-2015', 'Aug-2014', 'Jun-2015', 'Nov-2014', 'Aug-2016', 'Dec-2015', 'Jul-2015', 'Apr-2016', 'Jan-2012', 'May-2015', 'May-2012', 'Mar-2012', 'Oct-2015', 'Mar-2015', 'Dec-2011', 'Nov-2011', 'Oct-2011', 'Sep-2011', 'Aug-2011', 'Jul-2011', 'Jun-2011', 'May-2011', 'Apr-2011', 'Mar-2011', 'Feb-2011', 'Jan-2011', 'Dec-2010', 'Nov-2010', 'Oct-2010', 'Sep-2010', 'Aug-2010', 'Jul-2010', 'Jun-2010', 'May-2010', 'Apr-2010', 'Mar-2010', 'Feb-2010', 'Jan-2010', 'Dec-2009', 'Nov-2009', 'Oct-2009', 'Aug-2009', 'Jul-2009', 'Sep-2009', 'May-2009', 'Jun-2009', 'Apr-2009', 'Mar-2009', 'Jan-2009', 'Feb-2009', 'Dec-2008', 'Jul-2008', 'Oct-2008', 'Jun-2008', 
'Nov-2008', 'Aug-2008', 'Apr-2008', 'May-2008', 'Mar-2008', 'Sep-2008', 'Dec-2007', 'Jan-2008'. The levels have been removed, and values have been coerced to 'NA'.Warning: Novel levels found in column 'last_credit_pull_d': 'Sep-2016', 'Jan-2016', 'Apr-2015', 'Jul-2015', 'Feb-2016', 'Mar-2014', 'Sep-2012', 'Dec-2014', 'Jun-2012', 'Mar-2015', 'Sep-2014', 'Apr-2014', 'Oct-2014', 'Feb-2013', 'Nov-2015', 'Oct-2012', 'Nov-2013', 'Nov-2014', 'Jul-2016', 'Oct-2015', 'Jan-2015', 'Aug-2015', 'Aug-2012', 'Sep-2013', 'Aug-2014', 'Jun-2016', 'Feb-2012', 'Oct-2013', 'Jan-2014', 'Jun-2013', 'Dec-2015', 'Jul-2014', 'Mar-2016', 'Dec-2013', 'Apr-2016', 'Sep-2015', 'Apr-2013', 'Nov-2012', 'May-2014', 'May-2015', 'Jun-2014', 'Jul-2012', 'May-2013', 'Feb-2015', 'Aug-2016', 'Jun-2015', 'Jan-2012', 'May-2016', 'Jul-2013', 'Feb-2014', 'Mar-2013', 'Mar-2012', 'Aug-2013', 'May-2012', 'Dec-2012', 'Dec-2011', 'Apr-2012', 'Oct-2011', 'Nov-2011', 'Aug-2011', 'Jan-2013', 'Sep-2011', 'Jul-2011', 'Jun-2011', 'May-2011', 'Apr-2011', 'Mar-2011', 'Feb-2011', 'Jan-2011', 'Dec-2010', 'Nov-2010', 'Oct-2010', 'Sep-2010', 'Aug-2010', 'Jul-2010', 'Jun-2010', 'Apr-2010', 'Mar-2010', 'Feb-2010', 'Aug-2007', 'May-2010', 'Dec-2009', 'Jan-2010', 'Nov-2009', 'Oct-2009', 'Jul-2009', 'Aug-2009', 'Apr-2009', 'Jun-2009', 'Feb-2009', 'Dec-2008', 'May-2009', 'May-2008', 'Mar-2009', 'Mar-2008', 'Sep-2008', 'Sep-2009', 'Feb-2008', 'Oct-2008', 'Jan-2009', 'Jan-2008', 'Oct-2007', 'Jun-2007', 'Aug-2008', 'Nov-2007', 'Sep-2007', 'May-2007'. The levels have been removed, and values have been coerced to 'NA'.
# Final VIP check for the submitted XGB model; extract_fit_parsnip()
# replaces pull_workflow_fit(), deprecated since workflows 0.2.3.
xgb_workflow_fit %>%
  extract_fit_parsnip() %>%
  vip(10)
Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
Please use `extract_fit_parsnip()` instead.
# DALEX explainer for the fitted XGB workflow.
# Fix: the target column is loan_status -- train$loan_default does not
# exist (the original run warned "Unknown or uninitialised column:
# `loan_default`" and built the explainer with no target). DALEX expects
# a numeric 0/1 target for classification, so convert the "0"/"1"
# factor via character to preserve the values.
xgb_explainer <- explain_tidymodels(
  xgb_workflow_fit,
  data = train,
  y = as.numeric(as.character(train$loan_status)),
  verbose = TRUE
)
Preparation of a new explainer is initiated
-> model label : workflow ( default )
-> data : 20843 rows 52 cols
-> data : tibble converted into a data.frame
Warning: Unknown or uninitialised column: `loan_default`.
-> target variable : not specified! ( WARNING )
-> predict function : yhat.workflow will be used ( default )
-> predicted values : No value for predict function target column. ( default )
-> model_info : package tidymodels , ver. 1.0.0 , task classification ( default )
-> model_info : Model info detected classification task but 'y' is a NULL . ( WARNING )
-> model_info : By deafult classification tasks supports only numercical 'y' parameter.
-> model_info : Consider changing to numerical vector with 0 and 1 values.
-> model_info : Otherwise I will not be able to calculate residuals or loss function.
-> predicted values : numerical, min = 0.00114727 , mean = 0.1510182 , max = 0.9787124
-> residual function : difference between y and yhat ( default )
A new explainer has been created!
# Partial-dependence profile for the categorical predictor `grade`.
pdp_grade <- model_profile(
  xgb_explainer,
  variables = "grade"
)
'variable_type' changed to 'categorical' due to lack of numerical variables.
# Default DALEX plot of the grade profile with descriptive labels.
plot(pdp_grade) +
  labs(
    title = "PDP loan GRADE",
    x = "grade",
    y = "average impact on prediction"
  )
# Re-draw the grade profile as a plain bar chart built from the
# aggregated profile data (`_x_` = factor level, `_yhat_` = mean
# model prediction at that level).
pdp_grade$agr_profiles %>%
  as_tibble() %>%
  mutate(
    profile_variable = `_x_`,
    avg_prediction_impact = `_yhat_`
  ) %>%
  ggplot(aes(x = profile_variable, y = avg_prediction_impact)) +
  geom_col() +
  labs(
    x = "Variable: Loan GRADE",
    y = " Average prediction Impact ",
    color = NULL,
    title = "Partial dependence plot Loan GRADE",
    subtitle = "How does GRADE impact predictions (on average)"
  )
# Numeric partial-dependence profiles for FICO score and annual income,
# each shown with the default DALEX plot.
pdp_fico <- model_profile(
  xgb_explainer,
  variables = "fico_range_low"
)
plot(pdp_fico)

pdp_income <- model_profile(
  xgb_explainer,
  variables = "annual_inc"
)
plot(pdp_income)
# Custom line plot of the FICO partial-dependence profile.
# FIX: the title previously read "Partial dependence plot Loan GRADE"
# (copy-pasted from the grade chart); this chart shows fico_range_low.
as_tibble(pdp_fico$agr_profiles) %>%
  mutate(profile_variable = `_x_`,
         avg_prediction_impact = `_yhat_`) %>%
  ggplot(aes(x = profile_variable, y = avg_prediction_impact)) +
  geom_line() +
  labs(
    x = "Variable: Fico Range Low",
    y = " Average prediction Impact ",
    color = NULL,
    title = "Partial dependence plot Fico Range Low",
    subtitle = "How does Fico Range Low impact predictions (on average)"
  )
# Custom line plot of the income profile, trimming extreme incomes
# (>= $6M) so the bulk of the curve is readable.
# FIX: x label, title and subtitle previously said "Fico Range Low" /
# "Loan GRADE" (copy-paste); this chart shows annual_inc.
as_tibble(pdp_income$agr_profiles) %>%
  mutate(profile_variable = `_x_`,
         avg_prediction_impact = `_yhat_`) %>%
  filter(profile_variable < 6000000) %>%
  ggplot(aes(x = profile_variable, y = avg_prediction_impact)) +
  geom_line() +
  labs(
    x = "Variable: annual_inc",
    y = " Average prediction Impact ",
    color = NULL,
    title = "Partial dependence plot annual_inc",
    subtitle = "How does annual_inc impact predictions (on average)"
  )
library(DALEX)
library(DALEXtra)
# Rebuild the explainer with the real target column.
# FIX: passing the factor `loan_status` directly makes residuals NA
# ("'-' not meaningful for factors" warning in the output below);
# DALEX wants a numeric 0/1 vector, so convert via the factor labels.
xgb_explainer <- explain_tidymodels(
  xgb_workflow_fit,
  data = train,
  y = as.numeric(as.character(train$loan_status)),
  verbose = TRUE
)
Preparation of a new explainer is initiated
-> model label : workflow ( default )
-> data : 20843 rows 52 cols
-> data : tibble converted into a data.frame
-> target variable : 20843 values
-> predict function : yhat.workflow will be used ( default )
-> predicted values : No value for predict function target column. ( default )
-> model_info : package tidymodels , ver. 1.0.0 , task classification ( default )
-> model_info : Model info detected classification task but 'y' is a factor . ( WARNING )
-> model_info : By deafult classification tasks supports only numercical 'y' parameter.
-> model_info : Consider changing to numerical vector with 0 and 1 values.
-> model_info : Otherwise I will not be able to calculate residuals or loss function.
-> predicted values : numerical, min = 0.00114727 , mean = 0.1510182 , max = 0.9787124
-> residual function : difference between y and yhat ( default )
Warning: ‘-’ not meaningful for factors
-> residuals : numerical, min = NA , mean = NA , max = NA
A new explainer has been created!
# NOTE(review): `pdp_age` was misnamed -- it profiled annual_inc and was
# recomputed identically as `pdp_income` right below. Compute the profile
# once and keep `pdp_age` as an alias for backward compatibility.
pdp_income <- model_profile(
  xgb_explainer,
  variables = "annual_inc"
)
pdp_age <- pdp_income
# FIX: labs() must be combined with the plot via `+`; the original bare
# labs() call on its own line just printed a labels object (see the
# console output that followed).
plot(pdp_income) +
  labs(title = "PDP annual_inc", x = "annual_inc", y = "average impact on prediction")
$x
[1] "annual_inc"
$y
[1] "average impact on prediction"
$title
[1] "PDP annual_inc"
attr(,"class")
[1] "labels"
# Custom income PDP line plot (incomes >= $6M trimmed as outliers).
# FIX: title previously said "Partial dependence plot Loan GRADE"
# (copy-paste); this chart shows annual_inc.
as_tibble(pdp_income$agr_profiles) %>%
  mutate(profile_variable = `_x_`,
         avg_prediction_impact = `_yhat_`) %>%
  filter(profile_variable < 6000000.00) %>%
  ggplot(aes(x = profile_variable, y = avg_prediction_impact)) +
  geom_line() +
  labs(
    x = "Variable: annual_inc",
    y = " Average prediction Impact ",
    color = NULL,
    title = "Partial dependence plot annual_inc",
    subtitle = "How does annual_inc impact predictions (on average)"
  )
# speed things up! -- explain on a 10% sample of just the recipe columns.
# FIX 1: keep `loan_status` in the sample; the original select() dropped
# it, so `train_sample$loan_status` was NULL ("Unknown or uninitialised
# column" warning) and the explainer had no target.
# FIX 2: pass y as a numeric 0/1 vector, not a factor, and keep the
# `data` argument to predictor columns only (as the original did).
train_sample <- train %>%
  select(loan_status,        # target, needed for `y` below
         last_credit_pull_d, # select just the columns used in recipe
         last_pymnt_amnt,
         term,
         last_pymnt_d,
         total_rec_late_fee,
         installment,
         annual_inc,
         inq_last_6mths,
         funded_amnt_inv) %>%
  sample_frac(0.1) # take a 10% sample or less

xgb_explainer <- explain_tidymodels(
  xgb_workflow_fit,
  data = train_sample %>% select(-loan_status),
  y = as.numeric(as.character(train_sample$loan_status)),
  verbose = TRUE
)
Preparation of a new explainer is initiated
-> model label : workflow ( default )
-> data : 2084 rows 9 cols
-> data : tibble converted into a data.frame
Warning: Unknown or uninitialised column: `loan_status`.
-> target variable : not specified! ( WARNING )
-> predict function : yhat.workflow will be used ( default )
-> predicted values : No value for predict function target column. ( default )
-> model_info : package tidymodels , ver. 1.0.0 , task classification ( default )
-> model_info : Model info detected classification task but 'y' is a NULL . ( WARNING )
-> model_info : By deafult classification tasks supports only numercical 'y' parameter.
-> model_info : Consider changing to numerical vector with 0 and 1 values.
-> model_info : Otherwise I will not be able to calculate residuals or loss function.
-> predicted values : the predict_function returns an error when executed ( WARNING )
-> residual function : difference between y and yhat ( default )
A new explainer has been created!
# you should use TEST not training for this!
score_test %>% head()

# Top 5 TP: correctly predicted defaults with the highest default score.
# FIX: a true positive needs loan_status == 1; the original filter
# (loan_status != 1) selected true NEGATIVES.
top_5_tp <- score_test %>%
  filter(.pred_class == loan_status) %>%
  filter(loan_status == 1) %>%
  slice_max(order_by = .pred_1, n = 5)

# Top 5 FP: predicted default but actually paid, highest default score.
# FIX: a false positive needs a WRONG prediction; the original block was
# a byte-for-byte copy of the TP filter.
top_5_fp <- score_test %>%
  filter(.pred_class != loan_status) %>%
  filter(loan_status != 1) %>%
  slice_max(order_by = .pred_1, n = 5)

# Bottom 5 FN: actual defaults the model missed, lowest default score.
# FIX: a false negative needs a WRONG prediction; the original filter
# (.pred_class == loan_status) selected true positives instead.
bottom_5_fn <- score_test %>%
  filter(.pred_class != loan_status) %>%
  filter(loan_status == 1) %>%
  slice_min(order_by = .pred_1, n = 5)
explain_prediction <- function(top_5_tp) {
  # Explain one scored record with SHAP and print the contribution plot.
  # `top_5_tp` is a single-row tibble of one scored observation (the
  # argument name is kept as-is for backward compatibility).

  # 1. SHAP decomposition of this record's prediction.
  record_shap <- predict_parts(
    explainer = xgb_explainer,
    new_observation = top_5_tp,
    type = "shap"
  )

  # 2. Rounded predicted default probability for the plot title.
  prediction_prob <- top_5_tp[, ".pred_1"] %>%
    mutate(.pred_default = round(.pred_1, 3)) %>%
    pull()

  # 3. Plot it.
  # you notice you don't get categorical values ...
  shap_plot <- plot(record_shap) +
    labs(
      title = paste("SHAP Explainer:", prediction_prob),
      x = "shap importance",
      y = "record"
    )
  print(shap_plot)
}
# example TP 5 records
# FIX: seq_len() is safe when the tibble has zero rows; 1:nrow(x) would
# iterate over c(1, 0) and index out of bounds.
for (row in seq_len(nrow(top_5_tp))) {
  s_record <- top_5_tp[row, ]
  explain_prediction(s_record)
}
# Small sample of the training data for fast SHAP explanations.
loan_sample <- train %>% sample_n(1000)

# FIX: pass y as a numeric 0/1 vector; the factor target made residuals
# NA ("'-' not meaningful for factors" warning that followed).
loans_explainer <- explain_tidymodels(
  xgb_workflow_fit,                                      # fitted workflow object
  data = loan_sample,                                    # original training data
  y = as.numeric(as.character(loan_sample$loan_status)), # outcome as 0/1
  label = "xgboost",
  verbose = FALSE
)
Warning: ‘-’ not meaningful for factors
# Explain a single scored record with a DALEX breakdown, then re-plot the
# contributions joined back to the record's actual feature values so
# categorical levels appear in the axis labels.
explain_prediction <- function(single_record) {
  # step 1. run the explainer on this one observation
  record_shap <- predict_parts(explainer = loans_explainer,
                               new_observation = single_record
                               #type="fastshap"
                               )
  # step 2. default DALEX plot.
  # you notice you don't get categorical values ...
  record_shap %>% plot() %>% print()

  # --- more involved explanations with categories. ----
  # step 2a. convert breakdown to a tibble so we can join it
  record_shap %>%
    as_tibble() -> shap_data
  # step 2b. transpose the single-row prediction to (variable_name, value).
  # FIX: pivot_longer() replaces the superseded gather(); mixed column
  # types are coerced to character, matching gather()'s behavior.
  single_record %>%
    pivot_longer(everything(),
                 names_to = "variable_name",
                 values_to = "value",
                 values_transform = as.character) -> prediction_data
  # step 2c. get a predicted probability for plot
  prediction_prob <- single_record[, ".pred_1"] %>%
    mutate(.pred_1 = round(.pred_1, 3)) %>%
    pull()
  # step 3. plot contributions labelled "variable: value".
  # FIX: join key made explicit; the implicit join relied on
  # `variable_name` being the only column shared by the two tibbles.
  shap_data %>%
    inner_join(prediction_data, by = "variable_name") %>%
    mutate(variable = paste(variable_name, value, sep = ": ")) %>%
    group_by(variable) %>%
    summarize(contribution = mean(contribution)) %>%
    mutate(contribution = round(contribution, 3),
           sign = if_else(contribution < 0, "neg", "pos")) %>%
    ggplot(aes(y = reorder(variable, contribution), x = contribution, fill = sign)) +
    geom_col() +
    geom_text(aes(label = contribution)) +
    labs(
      title = "SHAPLEY explainations",
      subtitle = paste("predicted probablity = ", prediction_prob),
      x = "contribution",
      y = "features")
}
# -- score training: class probabilities + hard class, joined to inputs
scored_train <- xgb_workflow_fit %>%
  predict(train, type = "prob") %>%
  bind_cols(predict(xgb_workflow_fit, train, type = "class")) %>%
  bind_cols(train)

# -- score testing
scored_test <- xgb_workflow_fit %>%
  predict(test, type = "prob") %>%
  bind_cols(predict(xgb_workflow_fit, test, type = "class")) %>%
  bind_cols(test)
# True positives: correctly flagged defaults with the highest default score.
top_5_tp <- scored_test %>%
  filter(.pred_class == loan_status) %>%
  filter(loan_status == 1) %>%
  slice_max(.pred_1, n = 5)

# False positives: flagged as default but actually paid.
# FIX: the original filtered loan_status == 1, which (combined with a
# wrong prediction) selects false NEGATIVES; a FP has loan_status != 1.
top_5_fp <- scored_test %>%
  filter(.pred_class != loan_status) %>%
  filter(loan_status != 1) %>%
  slice_max(.pred_1, n = 5)

# False negatives: actual defaults the model missed
# (highest-scoring ones, i.e. closest to the decision threshold).
top_5_fn <- scored_test %>%
  filter(.pred_class != loan_status) %>%
  filter(loan_status == 1) %>%
  slice_max(.pred_1, n = 5)
# repeat for FP and FN
for (row in 1:nrow(top_5_tp)) {
s_record <- top_5_tp[row,]
explain_prediction(s_record)
}
[WARNING] Deprecated: --self-contained. use --embed-resources --standalone
# Explain each false-positive record.
# FIX: the loop bound used top_5_fp but the body indexed top_5_tp
# (copy-paste), so the FP records were never explained.
for (row in seq_len(nrow(top_5_fp))) {
  s_record <- top_5_fp[row, ]
  explain_prediction(s_record)
}
# Explain each false-negative record.
# FIX: the loop bound used top_5_fn but the body indexed top_5_tp
# (copy-paste), so the FN records were never explained.
for (row in seq_len(nrow(top_5_fn))) {
  s_record <- top_5_fn[row, ]
  explain_prediction(s_record)
}